library(tidyverse)
library(janitor)
library(ggfortify)
library(GGally)
library(modelr)
# Load the house-sales data; column types are guessed by readr.
prices <- read_csv(file = "data/kc_house_data.csv")
Rows: 21613 Columns: 21── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): id
dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, view, condition, grade, sqft_abo...
dttm (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Error in exists(cacheKey, where = .rs.WorkingDataEnv, inherits = FALSE) :
invalid first argument
Error in assign(cacheKey, frame, .rs.CachedDataEnv) :
attempt to use zero-length variable name
# Print the tibble, then list every column with its type and first values.
prices
glimpse(prices)
Rows: 21,613
Columns: 21
$ id <chr> "7129300520", "6414100192", "5631500400", "2487200875", "1954400510", "7237550310", "132140006…
$ date <dttm> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09, 2015-02-18, 2014-05-12, 2014-06-27, 2015-01-1…
$ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, 257500, 291850, 229500, 323000, 662500, 46800…
$ bedrooms <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3, 4, 2, 3, 4, 3, 5, 2, 3, 3, 3, 3, 3, 4, 3, 2…
$ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1.00, 2.50, 2.50, 1.00, 1.00, 1.75, 2.00, 3.00…
$ sqft_living <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1780, 1890, 3560, 1160, 1430, 1370, 1810, 2950,…
$ sqft_lot <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711, 7470, 6560, 9796, 6000, 19901, 9680, 4850, …
$ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5, 2.0, 2.0, 1.5, 1.0,…
$ waterfront <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ view <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ condition <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 4, 5, 3, 5, 3, 3, 3, 3…
$ grade <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 9, 8, 7, 8, 6, 8, 8, 7, 8, 8, …
$ sqft_above <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1050, 1890, 1860, 860, 1430, 1370, 1810, 1980, …
$ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300, 0, 0, 0, 970, 0, 0, 0, 0, 760, 720, 0, 0, 0,…
$ yr_built <dbl> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, 1960, 2003, 1965, 1942, 1927, 1977, 1900, 1979…
$ yr_renovated <dbl> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ zipcode <dbl> 98178, 98125, 98028, 98136, 98074, 98053, 98003, 98198, 98146, 98038, 98007, 98115, 98028, 980…
$ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6561, 47.3097, 47.4095, 47.5123, 47.3684, 47.6…
$ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045, -122.005, -122.327, -122.315, -122.337, -122…
$ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1780, 2390, 2210, 1330, 1780, 1370, 1360, 2140…
$ sqft_lot15 <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711, 8113, 7570, 8925, 6000, 12697, 10208, 4850, …
# Overview of the data: row/column counts and a breakdown by column type.
skimr::skim(prices)
── Data Summary ────────────────────────
Values
Name prices
Number of rows 21613
Number of columns 21
_______________________
Column type frequency:
character 1
numeric 19
POSIXct 1
________________________
Group variables None
# Build the modelling data set:
#   * drop zipcode, the two *15 columns, id and date;
#   * waterfront becomes logical, view and condition become factors;
#   * yr_renovated and sqft_basement are reduced to TRUE/FALSE flags
#     (anything not strictly positive, including NA, counts as FALSE).
prices_clean <- prices %>%
  select(-c(zipcode, sqft_living15, sqft_lot15, id, date)) %>%
  mutate(
    waterfront = as.logical(waterfront),
    across(c(view, condition), as.factor),
    renovated = coalesce(yr_renovated > 0, FALSE),
    basement = coalesce(sqft_basement > 0, FALSE)
  ) %>%
  select(-c(yr_renovated, sqft_basement))
# Check the full model for aliased (linearly dependent) predictors.
# The output below lists only the model formula, i.e. no dependencies.
alias(lm(price ~ ., prices_clean))
Model :
price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors +
waterfront + view + condition + grade + sqft_above + yr_built +
lat + long + renovated + basement
# Pairwise plots of price against every other column, split into three
# grids so each ggpairs() panel stays readable.
# Predicates are wrapped in where(): passing a bare predicate such as
# is.numeric to select() is deprecated in tidyselect.

# price with the next five numeric columns
prices_clean %>%
  select(where(is.numeric)) %>%
  select(1:6) %>%
  ggpairs()

# price with the remaining numeric columns
prices_clean %>%
  select(where(is.numeric)) %>%
  select(price, 7:11) %>%
  ggpairs()

# price with the non-numeric (logical and factor) columns
prices_clean %>%
  select(price, !where(is.numeric)) %>%
  ggpairs()
Correlations with price:
# Candidate first predictors: one single-predictor model per variable.
mod1a <- lm(price ~ sqft_living, data = prices_clean)
mod1b <- lm(price ~ grade, data = prices_clean)
mod1c <- lm(price ~ sqft_above, data = prices_clean)

# Diagnostic plots for each candidate.
autoplot(mod1a)
autoplot(mod1b)
autoplot(mod1c)

# Coefficients and fit statistics for the sqft_living model.
summary(mod1a)
Call:
lm(formula = price ~ sqft_living, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1476062 -147486 -24043 106182 4362067
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -43580.743 4402.690 -9.899 <2e-16 ***
sqft_living 280.624 1.936 144.920 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 261500 on 21611 degrees of freedom
Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928
F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ grade.
summary(mod1b)
Call:
lm(formula = price ~ grade, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-816988 -151958 -36158 97842 6046097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1056045 12256 -86.17 <2e-16 ***
grade 208458 1582 131.76 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared: 0.4455, Adjusted R-squared: 0.4454
F-statistic: 1.736e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_above.
summary(mod1c)
Call:
lm(formula = price ~ sqft_above, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-913132 -165624 -41468 109327 5339232
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59953.2 4729.8 12.68 <2e-16 ***
sqft_above 268.5 2.4 111.87 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 292200 on 21611 degrees of freedom
Multiple R-squared: 0.3667, Adjusted R-squared: 0.3667
F-statistic: 1.251e+04 on 1 and 21611 DF, p-value: < 2.2e-16
The residuals-vs-fitted plot looks best for model c (sqft_above), but that model has a
much poorer adjusted R^2 than the other two.
sqft_living has the highest correlation with price and the best adjusted R^2 (0.49), so it is taken forward as the first predictor.
# Residuals of mod1a, i.e. the part of price not explained by sqft_living.
# price and sqft_living are dropped so the remaining columns can be
# compared against the residuals.
prices_resid <- prices_clean %>%
  add_residuals(mod1a) %>%
  select(-price, -sqft_living)

# The residual column is selected by name (add_residuals() calls it
# "resid" by default) rather than by position, and predicates are wrapped
# in where() because bare predicates in select() are deprecated.

# first four numeric columns against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(1:4, resid) %>%
  ggpairs()

# remaining numeric columns (the last one is resid)
prices_resid %>%
  select(where(is.numeric)) %>%
  select(5:10) %>%
  ggpairs()

# non-numeric columns against the residuals
prices_resid %>%
  select(resid, !where(is.numeric)) %>%
  ggpairs()
Correlations:
# Candidate second predictors, each added to sqft_living.
mod2a <- lm(price ~ sqft_living + waterfront, data = prices_clean)
mod2b <- lm(price ~ sqft_living + renovated, data = prices_clean)
mod2c <- lm(price ~ sqft_living + basement, data = prices_clean)
mod2d <- lm(price ~ sqft_living + lat, data = prices_clean)

# Diagnostic plots for each candidate.
autoplot(mod2a)
autoplot(mod2b)
autoplot(mod2c)
autoplot(mod2d)

# Base-graphics diagnostics for the waterfront model as well.
plot(mod2a)

# The ten most expensive houses. Rows 3915, 7253 and 9255 can be inspected
# individually with slice() instead (presumably the points flagged in the
# diagnostic plots -- TODO confirm).
slice_max(prices_clean, price, n = 10)

# Coefficients and fit statistics for the waterfront model.
summary(mod2a)
Call:
lm(formula = price ~ sqft_living + waterfront, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1376782 -142867 -21360 107201 4449253
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -32957.851 4242.971 -7.768 8.35e-15 ***
sqft_living 272.507 1.873 145.499 < 2e-16 ***
waterfrontTRUE 829983.104 19882.279 41.745 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 251500 on 21610 degrees of freedom
Multiple R-squared: 0.5307, Adjusted R-squared: 0.5307
F-statistic: 1.222e+04 on 2 and 21610 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + renovated.
summary(mod2b)
Call:
lm(formula = price ~ sqft_living + renovated, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1447169 -146107 -23192 106324 4228135
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -46328.631 4372.000 -10.60 <2e-16 ***
sqft_living 278.693 1.925 144.80 <2e-16 ***
renovatedTRUE 159947.478 8783.478 18.21 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 259500 on 21610 degrees of freedom
Multiple R-squared: 0.5005, Adjusted R-squared: 0.5005
F-statistic: 1.083e+04 on 2 and 21610 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + basement.
summary(mod2c)
Call:
lm(formula = price ~ sqft_living + basement, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1457685 -146961 -22846 104553 4375783
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -48371.266 4440.011 -10.894 < 2e-16 ***
sqft_living 277.495 1.976 140.468 < 2e-16 ***
basementTRUE 28768.071 3715.285 7.743 1.01e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 261100 on 21610 degrees of freedom
Multiple R-squared: 0.4943, Adjusted R-squared: 0.4942
F-statistic: 1.056e+04 on 2 and 21610 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + lat.
summary(mod2d)
Call:
lm(formula = price ~ sqft_living + lat, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1487994 -125643 -20309 84613 4368717
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.416e+07 5.653e+05 -60.44 <2e-16 ***
sqft_living 2.749e+02 1.794e+00 153.27 <2e-16 ***
lat 7.177e+05 1.189e+04 60.36 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 241900 on 21610 degrees of freedom
Multiple R-squared: 0.566, Adjusted R-squared: 0.566
F-statistic: 1.409e+04 on 2 and 21610 DF, p-value: < 2.2e-16
# F-test of nested models: does adding waterfront to price ~ sqft_living
# significantly reduce the residual sum of squares?
anova(mod1a, mod2a)
Analysis of Variance Table
Model 1: price ~ sqft_living
Model 2: price ~ sqft_living + waterfront
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21611 1.4773e+15
2 21610 1.3670e+15 1 1.1024e+14 1742.6 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# F-test of nested models: does adding lat to price ~ sqft_living
# significantly reduce the residual sum of squares?
anova(mod1a, mod2d)
Analysis of Variance Table
Model 1: price ~ sqft_living
Model 2: price ~ sqft_living + lat
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21611 1.4773e+15
2 21610 1.2641e+15 1 2.1314e+14 3643.5 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Either a (waterfront) or d (lat) would be a good addition. lat improves the model slightly more than waterfront (adjusted R^2 0.566 vs 0.531), so it is the one taken forward. The residuals-vs-leverage plot for a also looks a little odd.
# Residuals of mod2d, i.e. the part of price not explained by sqft_living
# and lat. The response and both fitted predictors are dropped so the
# remaining columns can be compared against the residuals.
prices_resid <- prices_clean %>%
  add_residuals(mod2d) %>%
  select(-price, -sqft_living, -lat)

# The residual column is selected by name (add_residuals() calls it
# "resid" by default) rather than by position, and predicates are wrapped
# in where() because bare predicates in select() are deprecated.

# first four numeric columns against the residuals
prices_resid %>%
  select(where(is.numeric)) %>%
  select(1:4, resid) %>%
  ggpairs()

# remaining numeric columns (the last one is resid)
prices_resid %>%
  select(where(is.numeric)) %>%
  select(5:9) %>%
  ggpairs()

# non-numeric columns against the residuals
prices_resid %>%
  select(resid, !where(is.numeric)) %>%
  ggpairs()
Correlations:
# Show the rows reporting more than 30 bedrooms.
filter(prices_clean, bedrooms > 30)
33 bedrooms with fewer than 2 bathrooms, on only 1 floor? Surely not – this is most likely a data-entry error.
# Candidate third predictors, each added to sqft_living + lat.
mod3a <- lm(price ~ sqft_living + lat + waterfront, data = prices_clean)
mod3b <- lm(price ~ sqft_living + lat + view, data = prices_clean)
mod3c <- lm(price ~ sqft_living + lat + yr_built, data = prices_clean)

# Diagnostic plots for each candidate.
autoplot(mod3a)
autoplot(mod3b)
autoplot(mod3c)

# Coefficients and fit statistics for the waterfront model.
summary(mod3a)
Call:
lm(formula = price ~ sqft_living + lat + waterfront, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1386107 -119931 -17106 84803 4458444
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.465e+07 5.387e+05 -64.33 <2e-16 ***
sqft_living 2.665e+02 1.719e+00 155.07 <2e-16 ***
lat 7.282e+05 1.133e+04 64.27 <2e-16 ***
waterfrontTRUE 8.532e+05 1.822e+04 46.83 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 230500 on 21609 degrees of freedom
Multiple R-squared: 0.606, Adjusted R-squared: 0.6059
F-statistic: 1.108e+04 on 3 and 21609 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + lat + view
# (view is a factor, so each level above the baseline gets a coefficient).
summary(mod3b)
Call:
lm(formula = price ~ sqft_living + lat + view, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1795220 -113556 -14664 81290 4412462
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.419e+07 5.309e+05 -64.40 <2e-16 ***
sqft_living 2.499e+02 1.758e+00 142.14 <2e-16 ***
lat 7.189e+05 1.117e+04 64.38 <2e-16 ***
view1 1.583e+05 1.261e+04 12.55 <2e-16 ***
view2 1.289e+05 7.585e+03 17.00 <2e-16 ***
view3 2.289e+05 1.034e+04 22.13 <2e-16 ***
view4 6.172e+05 1.304e+04 47.35 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 227100 on 21606 degrees of freedom
Multiple R-squared: 0.6175, Adjusted R-squared: 0.6174
F-statistic: 5814 on 6 and 21606 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + lat + yr_built.
summary(mod3c)
Call:
lm(formula = price ~ sqft_living + lat + yr_built, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1645242 -120261 -14570 85543 4077885
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.770e+07 5.929e+05 -46.72 <2e-16 ***
sqft_living 2.936e+02 1.861e+00 157.79 <2e-16 ***
lat 6.551e+05 1.182e+04 55.40 <2e-16 ***
yr_built -1.787e+03 5.875e+01 -30.42 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 236900 on 21609 degrees of freedom
Multiple R-squared: 0.5838, Adjusted R-squared: 0.5838
F-statistic: 1.011e+04 on 3 and 21609 DF, p-value: < 2.2e-16
# F-test of nested models: does adding view (4 extra parameters) to
# price ~ sqft_living + lat significantly reduce the residual sum of squares?
anova(mod2d, mod3b)
Analysis of Variance Table
Model 1: price ~ sqft_living + lat
Model 2: price ~ sqft_living + lat + view
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21610 1.2641e+15
2 21606 1.1141e+15 4 1.5e+14 727.23 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# F-test of nested models: does adding waterfront to
# price ~ sqft_living + lat significantly reduce the residual sum of squares?
anova(mod2d, mod3a)
Analysis of Variance Table
Model 1: price ~ sqft_living + lat
Model 2: price ~ sqft_living + lat + waterfront
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21610 1.2641e+15
2 21609 1.1477e+15 1 1.1646e+14 2192.7 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
View appears to be the best predictor to add next
# Residuals of mod3b, with the response and the three fitted predictors
# removed so the remaining columns can be compared against the residuals.
prices_resid <- add_residuals(prices_clean, mod3b) %>%
  select(-c(price, sqft_living, lat, view))

# No numeric-only filter here: every remaining column, numeric or not, is
# plotted against the residuals, in two grids.

# columns 1-6 together with the residual column
prices_resid %>%
  select(1:6, resid) %>%
  ggpairs()

# everything after column 6 (the last column is resid)
prices_resid %>%
  select(-(1:6)) %>%
  ggpairs()
Correlations:
# Candidate fourth predictors, each added to sqft_living + lat + view.
mod4a <- lm(
  price ~ sqft_living + lat + view + yr_built,
  data = prices_clean
)
mod4b <- lm(
  price ~ sqft_living + lat + view + grade,
  data = prices_clean
)
mod4c <- lm(
  price ~ sqft_living + lat + view + waterfront,
  data = prices_clean
)

# Diagnostic plots for each candidate.
autoplot(mod4a)
autoplot(mod4b)
autoplot(mod4c)

# Coefficients and fit statistics for the yr_built model.
summary(mod4a)
Call:
lm(formula = price ~ sqft_living + lat + view + yr_built, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1913360 -109916 -10491 83933 4210635
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.918e+07 5.621e+05 -51.92 <2e-16 ***
sqft_living 2.663e+02 1.860e+00 143.22 <2e-16 ***
lat 6.702e+05 1.119e+04 59.88 <2e-16 ***
view1 1.380e+05 1.247e+04 11.07 <2e-16 ***
view2 1.090e+05 7.525e+03 14.49 <2e-16 ***
view3 2.058e+05 1.025e+04 20.08 <2e-16 ***
view4 5.881e+05 1.291e+04 45.55 <2e-16 ***
yr_built -1.383e+03 5.637e+01 -24.53 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 224000 on 21605 degrees of freedom
Multiple R-squared: 0.6279, Adjusted R-squared: 0.6278
F-statistic: 5208 on 7 and 21605 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for price ~ sqft_living + lat + view + grade.
summary(mod4b)
Call:
lm(formula = price ~ sqft_living + lat + view + grade, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1430046 -112420 -17585 77081 4765315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.224e+07 5.144e+05 -62.69 <2e-16 ***
sqft_living 1.736e+02 2.541e+00 68.35 <2e-16 ***
lat 6.684e+05 1.084e+04 61.65 <2e-16 ***
view1 1.589e+05 1.216e+04 13.06 <2e-16 ***
view2 1.190e+05 7.319e+03 16.26 <2e-16 ***
view3 2.131e+05 9.985e+03 21.34 <2e-16 ***
view4 6.008e+05 1.258e+04 47.76 <2e-16 ***
grade 7.958e+04 1.976e+03 40.27 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 219000 on 21605 degrees of freedom
Multiple R-squared: 0.6442, Adjusted R-squared: 0.6441
F-statistic: 5589 on 7 and 21605 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for
# price ~ sqft_living + lat + view + waterfront.
summary(mod4c)
Call:
lm(formula = price ~ sqft_living + lat + view + waterfront, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-1575666 -113307 -14271 81725 4427530
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.453e+07 5.240e+05 -65.89 <2e-16 ***
sqft_living 2.503e+02 1.734e+00 144.32 <2e-16 ***
lat 7.260e+05 1.102e+04 65.87 <2e-16 ***
view1 1.563e+05 1.244e+04 12.56 <2e-16 ***
view2 1.242e+05 7.486e+03 16.59 <2e-16 ***
view3 2.088e+05 1.024e+04 20.39 <2e-16 ***
view4 3.916e+05 1.586e+04 24.69 <2e-16 ***
waterfrontTRUE 5.314e+05 2.186e+04 24.30 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 224000 on 21605 degrees of freedom
Multiple R-squared: 0.6277, Adjusted R-squared: 0.6276
F-statistic: 5204 on 7 and 21605 DF, p-value: < 2.2e-16
# F-test of nested models: does adding grade to
# price ~ sqft_living + lat + view significantly reduce the residual sum of
# squares?
anova(mod3b, mod4b)
Analysis of Variance Table
Model 1: price ~ sqft_living + lat + view
Model 2: price ~ sqft_living + lat + view + grade
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21606 1.1141e+15
2 21605 1.0363e+15 1 7.7805e+13 1622 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Final model: price ~ sqft_living + lat + view + grade
# Log-log version of the first model, for comparison with the untransformed
# fit.
# prices_log is not created anywhere in the visible script, so it is built
# here when it does not already exist. Natural logs are assumed from the
# "ln_" column names -- NOTE(review): confirm against the original definition.
if (!exists("prices_log")) {
  prices_log <- prices %>%
    mutate(
      ln_house_price = log(price),
      ln_sqft_living = log(sqft_living)
    )
}
mod1log <- lm(ln_house_price ~ ln_sqft_living, prices_log)
# Refit of price ~ sqft_living on the raw data; this replaces the mod1a that
# was fitted on prices_clean earlier (same formula).
mod1a <- lm(price ~ sqft_living, prices)
summary(mod1log)
Call:
lm(formula = ln_house_price ~ ln_sqft_living, data = prices_log)
Residuals:
Min 1Q Median 3Q Max
-1.10511 -0.29300 0.01262 0.25701 1.33011
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.729916 0.047062 143.0 <2e-16 ***
ln_sqft_living 0.836771 0.006223 134.5 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3886 on 21611 degrees of freedom
Multiple R-squared: 0.4555, Adjusted R-squared: 0.4555
F-statistic: 1.808e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Fit of the untransformed model, to set beside mod1log. The two models have
# different responses (price vs ln_house_price), so their residual standard
# errors are on different scales.
summary(mod1a)
Call:
lm(formula = price ~ sqft_living, data = prices)
Residuals:
Min 1Q Median 3Q Max
-1476062 -147486 -24043 106182 4362067
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -43580.743 4402.690 -9.899 <2e-16 ***
sqft_living 280.624 1.936 144.920 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 261500 on 21611 degrees of freedom
Multiple R-squared: 0.4929, Adjusted R-squared: 0.4928
F-statistic: 2.1e+04 on 1 and 21611 DF, p-value: < 2.2e-16
# Scatter plot of price against floors. An alternative view (floors
# converted to a factor and drawn with geom_boxplot()) is not active.
ggplot(prices_clean, aes(x = floors, y = price)) +
  geom_point()
# Compare grade as a categorical predictor (one coefficient per level above
# the baseline) with grade as a single numeric slope.
prices_factored <- mutate(prices_clean, grade = as.factor(grade))
mod2fac <- lm(price ~ grade, data = prices_factored)
mod2unfac <- lm(price ~ grade, data = prices_clean)

# Coefficients and fit statistics for the categorical version.
summary(mod2fac)
Call:
lm(formula = price ~ grade, data = prices_factored)
Residuals:
Min 1Q Median 3Q Max
-1929615 -135853 -35090 89080 5565658
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 142000 254499 0.558 0.576878
grade3 63667 293870 0.217 0.828484
grade4 72381 258849 0.280 0.779767
grade5 106524 255024 0.418 0.676169
grade6 159920 254561 0.628 0.529868
grade7 260590 254513 1.024 0.305904
grade8 400853 254520 1.575 0.115285
grade9 631513 254547 2.481 0.013112 *
grade10 929771 254611 3.652 0.000261 ***
grade11 1354842 254817 5.317 1.07e-07 ***
grade12 2049222 255909 8.008 1.23e-15 ***
grade13 3567615 264106 13.508 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 254500 on 21601 degrees of freedom
Multiple R-squared: 0.5197, Adjusted R-squared: 0.5195
F-statistic: 2125 on 11 and 21601 DF, p-value: < 2.2e-16
# Coefficients and fit statistics for the numeric-grade version.
summary(mod2unfac)
Call:
lm(formula = price ~ grade, data = prices_clean)
Residuals:
Min 1Q Median 3Q Max
-816988 -151958 -36158 97842 6046097
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1056045 12256 -86.17 <2e-16 ***
grade 208458 1582 131.76 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 273400 on 21611 degrees of freedom
Multiple R-squared: 0.4455, Adjusted R-squared: 0.4454
F-statistic: 1.736e+04 on 1 and 21611 DF, p-value: < 2.2e-16
#ggpairs(prices_clean, progress = FALSE) # This removes the text without altering the code chunk settings
#ggsave("ggpairs_1.png", width = 15, height = 15) # You can set dimensions for images
# mod1 and mod2 are not defined anywhere in the visible script. They are
# fitted here from the formulas and data shown in the printed output below
# (price ~ sqft_above, and the same model with grade added as a factor).
# NOTE(review): the data set used for mod1 is not shown in the output;
# prices_factored is assumed so both models use the same rows.
mod1 <- lm(price ~ sqft_above, data = prices_factored)
mod2 <- lm(price ~ sqft_above + grade, data = prices_factored)
summary(mod2)
Call:
lm(formula = price ~ sqft_above + grade, data = prices_factored)
Residuals:
Min 1Q Median 3Q Max
-1845587 -136387 -34657 89917 5291001
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.212e+05 2.517e+05 0.482 0.6301
sqft_above 7.177e+01 3.244e+00 22.123 < 2e-16 ***
grade3 4.166e+04 2.906e+05 0.143 0.8860
grade4 4.629e+04 2.560e+05 0.181 0.8565
grade5 5.954e+04 2.522e+05 0.236 0.8134
grade6 1.040e+05 2.517e+05 0.413 0.6794
grade7 1.803e+05 2.517e+05 0.716 0.4738
grade8 2.876e+05 2.517e+05 1.143 0.2532
grade9 4.690e+05 2.518e+05 1.862 0.0626 .
grade10 7.272e+05 2.519e+05 2.886 0.0039 **
grade11 1.099e+06 2.522e+05 4.357 1.32e-05 ***
grade12 1.736e+06 2.535e+05 6.851 7.55e-12 ***
grade13 3.153e+06 2.618e+05 12.043 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 251700 on 21600 degrees of freedom
Multiple R-squared: 0.5303, Adjusted R-squared: 0.5301
F-statistic: 2033 on 12 and 21600 DF, p-value: < 2.2e-16
# F-test of nested models: does adding grade as a factor (11 extra
# parameters) to price ~ sqft_above significantly reduce the residual sum of
# squares? This tests the factor as a whole rather than level by level.
anova(mod1, mod2)
Analysis of Variance Table
Model 1: price ~ sqft_above
Model 2: price ~ sqft_above + grade
Res.Df RSS Df Sum of Sq F Pr(>F)
1 21611 1.8447e+15
2 21600 1.3681e+15 11 4.7663e+14 684.11 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Grade as a whole is significant and improves the model, even though not every individual grade level is significant.